import pandas as pd
import numpy as np
import sweetviz as sv
import os
os.getcwd()
'c:\\Users\\jhonloaiza\\Documents\\propio\\Master Uoc\\Semestre II\\Visualizacion datos\\PRA'
#se construye una lista con las prosibles formas de NA en el conjunto de datos
missing_values = ["n/a", "na", "--","NA"," ","N/A","Na"]
path = r'C:\Users\jhonloaiza\Documents\propio\Master Uoc\Semestre II\Visualizacion datos\PEC\PEC2\pax_data_1868_agreements_14-04-21.csv'
df = pd.read_csv(path, na_values = missing_values)
# Analizamos la data
report=sv.analyze(df, pairwise_analysis="off")
# Generating report
report.show_notebook()
Feature: ImSrc |██████████| [100%] 00:55 -> (00:00 left)
df.head()
| Con | Contp | PP | PPName | Reg | AgtId | Ver | Agt | Dat | Status | ... | TjRep | TjRSym | TjRMa | TjNR | ImUN | ImOth | ImRef | ImPK | ImE | ImSrc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | Government/territory | 2 | Afghanistan: 2000s Post-intervention process | Europe and Eurasia | 2232 | 3 | Resolution of Intra Afghan Peace Conference in... | 2019-07-08 | Multiparty signed/agreed | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | Afghanistan | Government | 2 | Afghanistan: 2000s Post-intervention process | Europe and Eurasia | 1739 | 2 | Agreement between the Islamic Republic of Afgh... | 2016-09-22 | Multiparty signed/agreed | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| 2 | Afghanistan | Government | 2 | Afghanistan: 2000s Post-intervention process | Europe and Eurasia | 1923 | 2 | Agreement between the two campaign teams regar... | 2014-09-21 | Multiparty signed/agreed | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 3 | Afghanistan | Government | 2 | Afghanistan: 2000s Post-intervention process | Europe and Eurasia | 864 | 1 | Tokyo Declaration Partnership for Self-Relianc... | 2012-07-08 | Multiparty signed/agreed | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
| 4 | Afghanistan | Government | 2 | Afghanistan: 2000s Post-intervention process | Europe and Eurasia | 848 | 1 | Conclusions of the Conference on Afghanistan a... | 2011-12-05 | Multiparty signed/agreed | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
5 rows × 267 columns
df_
df_final = df[['Con','Contp','PPName','Reg','Agt','AgtId','Dat','Status','Agtp','Stage','GCh','GDis','GAge','GMig','GRa','GRe','GInd','GIndOth','GRef','GSoc']]
df_final.to_csv('df_acuerdos_practica.csv', index=False)
Para este apartado se usaran solo algunos campos del dataset, los que parezcan ser relevantes para el objetivo de familiarizarnos con la data en particular
df.describe()
| PP | AgtId | Ver | Lgt | N_characters | Loc1GWNO | Loc2GWNO | UcdpAgr | PamAgr | CowWar | ... | TjRep | TjRSym | TjRMa | TjNR | ImUN | ImOth | ImRef | ImPK | ImE | ImSrc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1868.000000 | 1868.000000 | 1868.000000 | 1868.000000 | 1868.000000 | 1857.000000 | 246.000000 | 2.970000e+02 | 33.000000 | 1115.000000 | ... | 1868.000000 | 1868.000000 | 1868.000000 | 1868.000000 | 1868.000000 | 1868.000000 | 1868.000000 | 1868.000000 | 1868.000000 | 1868.0 |
| mean | 84.870450 | 1155.105996 | 1.249465 | 6.274625 | 11951.505353 | 528.693592 | 439.719512 | 2.903701e+10 | 17.272727 | 759.324664 | ... | 0.177195 | 0.011242 | 0.102248 | 0.300321 | 0.193790 | 0.238758 | 0.009636 | 0.291756 | 0.383833 | 1.0 |
| std | 42.273883 | 663.722169 | 0.590838 | 17.194469 | 37644.455332 | 223.469441 | 195.075032 | 3.602522e+11 | 10.022702 | 254.762977 | ... | 0.553679 | 0.105459 | 0.303056 | 0.608157 | 0.395372 | 0.426439 | 0.097715 | 0.454692 | 0.486448 | 0.0 |
| min | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 329.000000 | 41.000000 | 2.000000 | 2.400000e+01 | 1.000000 | 139.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.0 |
| 25% | 55.000000 | 568.750000 | 1.000000 | 1.000000 | 2397.250000 | 365.000000 | 346.250000 | 1.107000e+03 | 9.000000 | 817.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.0 |
| 50% | 90.000000 | 1135.500000 | 1.000000 | 2.000000 | 4354.000000 | 531.000000 | 371.000000 | 1.334000e+03 | 17.000000 | 857.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.0 |
| 75% | 123.000000 | 1736.250000 | 1.000000 | 5.000000 | 9420.500000 | 678.000000 | 625.000000 | 1.523000e+03 | 26.000000 | 905.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.0 |
| max | 160.000000 | 2317.000000 | 4.000000 | 323.000000 | 908459.000000 | 940.000000 | 770.000000 | 5.171991e+12 | 34.000000 | 941.000000 | ... | 3.000000 | 1.000000 | 1.000000 | 3.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.0 |
8 rows × 250 columns
df['Counts'] = df.groupby(['Con'])['Con'].transform('count')
df['Counts']
0 21
1 21
2 21
3 21
4 21
..
1863 50
1864 50
1865 3
1866 3
1867 3
Name: Counts, Length: 1868, dtype: int64
Identificamos los paises que poseen acuerdos, solo tomamos el primer pais como referencia para construir la variable "Con1"
#definimos la forma de hacer el split
new = df["Con"].str.split("/", n = 1, expand = True)
#separamos el primer pais
df["Con1"]= new[0]
#separamos el segundo pais
df["Con2"]= new[1]
# df display
df.head()
| Con | Contp | PP | PPName | Reg | AgtId | Ver | Agt | Dat | Status | ... | TjNR | ImUN | ImOth | ImRef | ImPK | ImE | ImSrc | Counts | Con1 | Con2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | Government/territory | 2 | Afghanistan: 2000s Post-intervention process | Europe and Eurasia | 2232 | 3 | Resolution of Intra Afghan Peace Conference in... | 2019-07-08 | Multiparty signed/agreed | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 21 | Afghanistan | None |
| 1 | Afghanistan | Government | 2 | Afghanistan: 2000s Post-intervention process | Europe and Eurasia | 1739 | 2 | Agreement between the Islamic Republic of Afgh... | 2016-09-22 | Multiparty signed/agreed | ... | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 21 | Afghanistan | None |
| 2 | Afghanistan | Government | 2 | Afghanistan: 2000s Post-intervention process | Europe and Eurasia | 1923 | 2 | Agreement between the two campaign teams regar... | 2014-09-21 | Multiparty signed/agreed | ... | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 21 | Afghanistan | None |
| 3 | Afghanistan | Government | 2 | Afghanistan: 2000s Post-intervention process | Europe and Eurasia | 864 | 1 | Tokyo Declaration Partnership for Self-Relianc... | 2012-07-08 | Multiparty signed/agreed | ... | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 21 | Afghanistan | None |
| 4 | Afghanistan | Government | 2 | Afghanistan: 2000s Post-intervention process | Europe and Eurasia | 848 | 1 | Conclusions of the Conference on Afghanistan a... | 2011-12-05 | Multiparty signed/agreed | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 21 | Afghanistan | None |
5 rows × 270 columns
df['Counts'] = df.groupby(['Con1'])['Con1'].transform('count')
df['Counts']
0 24
1 24
2 24
3 24
4 24
..
1863 50
1864 50
1865 3
1866 3
1867 3
Name: Counts, Length: 1868, dtype: int64
Obsevemos ahora los valores perdidos de las variables del dataset
prop_perdidos = df.isnull().sum() * 100 / len(df)
valores_perdidos_df = pd.DataFrame({'Variable': df.columns,
'% de perdidos': prop_perdidos})
valores_perdidos_df
| Variable | % de perdidos | |
|---|---|---|
| Con | Con | 0.000000 |
| Contp | Contp | 0.000000 |
| PP | PP | 0.000000 |
| PPName | PPName | 0.000000 |
| Reg | Reg | 0.000000 |
| ... | ... | ... |
| ImE | ImE | 0.000000 |
| ImSrc | ImSrc | 0.000000 |
| Counts | Counts | 0.000000 |
| Con1 | Con1 | 0.000000 |
| Con2 | Con2 | 53.104925 |
270 rows × 2 columns
# Variables numericas
data = df.select_dtypes(include = ["number"])
#guardamos los nombres de las columnas para usarlas posteriormente
#lista_cols_num = list(data.columns.values)
#extraemos el array con los valores de las variables escogidas
#data = data.values
#data.shape
data.describe().transpose().head(50)
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| PP | 1868.0 | 8.487045e+01 | 4.227388e+01 | 1.0 | 55.00 | 90.0 | 123.00 | 1.600000e+02 |
| AgtId | 1868.0 | 1.155106e+03 | 6.637222e+02 | 1.0 | 568.75 | 1135.5 | 1736.25 | 2.317000e+03 |
| Ver | 1868.0 | 1.249465e+00 | 5.908377e-01 | 1.0 | 1.00 | 1.0 | 1.00 | 4.000000e+00 |
| Lgt | 1868.0 | 6.274625e+00 | 1.719447e+01 | 1.0 | 1.00 | 2.0 | 5.00 | 3.230000e+02 |
| N_characters | 1868.0 | 1.195151e+04 | 3.764446e+04 | 329.0 | 2397.25 | 4354.0 | 9420.50 | 9.084590e+05 |
| Loc1GWNO | 1857.0 | 5.286936e+02 | 2.234694e+02 | 41.0 | 365.00 | 531.0 | 678.00 | 9.400000e+02 |
| Loc2GWNO | 246.0 | 4.397195e+02 | 1.950750e+02 | 2.0 | 346.25 | 371.0 | 625.00 | 7.700000e+02 |
| UcdpAgr | 297.0 | 2.903701e+10 | 3.602522e+11 | 24.0 | 1107.00 | 1334.0 | 1523.00 | 5.171991e+12 |
| PamAgr | 33.0 | 1.727273e+01 | 1.002270e+01 | 1.0 | 9.00 | 17.0 | 26.00 | 3.400000e+01 |
| CowWar | 1115.0 | 7.593247e+02 | 2.547630e+02 | 139.0 | 817.00 | 857.0 | 905.00 | 9.410000e+02 |
| GCh | 1868.0 | 3.179872e-01 | 7.735295e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 3.000000e+00 |
| GChRhet | 1868.0 | 8.083512e-02 | 2.726547e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GChAntid | 1868.0 | 3.211991e-03 | 5.659849e-02 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GChSubs | 1868.0 | 1.065310e-01 | 3.085987e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GChOth | 1868.0 | 1.124197e-02 | 1.054587e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GDis | 1868.0 | 1.381156e-01 | 5.546541e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 3.000000e+00 |
| GDisRhet | 1868.0 | 2.034261e-02 | 1.412072e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GDisAntid | 1868.0 | 9.635974e-03 | 9.771506e-02 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GDisSubs | 1868.0 | 4.871520e-02 | 2.153296e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GDisOth | 1868.0 | 2.676660e-03 | 5.168099e-02 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GAge | 1868.0 | 9.100642e-02 | 4.454910e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 3.000000e+00 |
| GAgeRhet | 1868.0 | 1.766595e-02 | 1.317694e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GAgeAntid | 1868.0 | 7.494647e-03 | 8.626970e-02 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GAgeSubs | 1868.0 | 2.997859e-02 | 1.705739e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GAgeOth | 1868.0 | 0.000000e+00 | 0.000000e+00 | 0.0 | 0.00 | 0.0 | 0.00 | 0.000000e+00 |
| GMig | 1868.0 | 2.623126e-02 | 2.424154e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 3.000000e+00 |
| GMigRhet | 1868.0 | 2.676660e-03 | 5.168099e-02 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GMigAntid | 1868.0 | 5.353319e-04 | 2.313724e-02 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GMigSubs | 1868.0 | 9.635974e-03 | 9.771506e-02 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GMigOth | 1868.0 | 0.000000e+00 | 0.000000e+00 | 0.0 | 0.00 | 0.0 | 0.00 | 0.000000e+00 |
| GRa | 1868.0 | 3.763383e-01 | 8.403439e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 3.000000e+00 |
| GRaRhet | 1868.0 | 1.043897e-01 | 3.058473e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GRaAntid | 1868.0 | 5.513919e-02 | 2.283129e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GRaSubs | 1868.0 | 1.017131e-01 | 3.023515e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GRaOth | 1868.0 | 5.353319e-03 | 7.298981e-02 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GRe | 1868.0 | 2.542827e-01 | 7.171537e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 3.000000e+00 |
| GReRhet | 1868.0 | 5.192719e-02 | 2.219395e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GReAntid | 1868.0 | 4.764454e-02 | 2.130700e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GReSubs | 1868.0 | 6.905782e-02 | 2.536203e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GReOth | 1868.0 | 7.494647e-03 | 8.626970e-02 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GInd | 1868.0 | 1.279443e-01 | 5.360198e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 3.000000e+00 |
| GIndRhet | 1868.0 | 2.997859e-02 | 1.705739e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GIndAntid | 1868.0 | 3.211991e-03 | 5.659849e-02 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GIndSubs | 1868.0 | 4.282655e-02 | 2.025201e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GIndOth | 1868.0 | 1.605996e-03 | 4.005341e-02 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GOth | 1868.0 | 5.781585e-02 | 3.628105e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 3.000000e+00 |
| GOthRhet | 1868.0 | 7.494647e-03 | 8.626970e-02 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GOthAntid | 1868.0 | 1.338330e-02 | 1.149402e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GOthSubs | 1868.0 | 1.231263e-02 | 1.103066e-01 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
| GOthOth | 1868.0 | 5.353319e-04 | 2.313724e-02 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000000e+00 |
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1868 entries, 0 to 1867 Columns: 251 entries, PP to Counts dtypes: float64(5), int64(246) memory usage: 3.6 MB
df_final.head
[['con', 'contp', 'PPName', 'Reg', 'Agt', 'Dat', 'Status', 'Agtp', 'Stage', 'GCh', 'GDis', 'Gage', 'GMig', 'GRa', 'Gre', 'Gind', 'GIndOth', 'Gref', 'Gsoc', 'Conflict']]